package com.tb;

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.junit.Test;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.IOException;

/**
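 * Manual smoke test that fetches a web page, cleans its HTML and evaluates an XPath
 * expression against the resulting DOM.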
 *
 * @author zhao yuan
 * @since 2018/3/27
 */
public class XpathTest {

    /**
     * Fetches a fixed article page with jsoup, cleans the body markup with HtmlCleaner,
     * converts the result to a W3C DOM and prints the text of every node matched by the
     * XPath expression.
     *
     * <p>This test needs network access to the hard-coded URL and makes no assertions;
     * it only writes the matches to standard output for manual inspection.
     *
     * @throws IOException                  if fetching the page fails
     * @throws XPathExpressionException     if the XPath expression cannot be evaluated
     * @throws ParserConfigurationException if the DOM document cannot be created
     */
    @Test
    public void test() throws IOException, XPathExpressionException, ParserConfigurationException {
        String xpath = "//div[@class='la_con']/text()";
        String url = "http://world.huanqiu.com/article/2018-03/11700368.html?qq-pf-to=pcqq.c2c";

        // Download the page and keep only the inner HTML of <body>.
        Connection connect = Jsoup.connect(url);
        String html = connect.get().body().html();

        // Normalise the markup so it can be turned into a well-formed DOM tree.
        HtmlCleaner hc = new HtmlCleaner();
        TagNode tn = hc.clean(html);
        Document document = new DomSerializer(new CleanerProperties()).createDOM(tn);

        // NODESET results are returned as org.w3c.dom.NodeList; printing the list object
        // itself only shows its class name and hash, so print each matched node's text.
        XPath matcher = XPathFactory.newInstance().newXPath();
        NodeList nodes = (NodeList) matcher.evaluate(xpath, document, XPathConstants.NODESET);
        System.out.println("matched nodes: " + nodes.getLength());
        for (int i = 0; i < nodes.getLength(); i++) {
            System.out.println(nodes.item(i).getTextContent());
        }
    }
}
